Packages We Need

library(tidyverse)
library(skimr)
library(naniar)
library(ggplot2)
library(dplyr)
library(maps)
library(flextable)
library(plotly)
library(flexdashboard)
library(lubridate)
library(stringr)
library(kableExtra)
library(highcharter)
library(wordcloud2)
library(RColorBrewer)
library(usmap)
library(leaflet)
library(broom)
library(moments)
set.seed(1994)

Loading the Datasets

# Fast Food dataset
fastFoodData <-  read_csv("fastFoodGeo.csv")

#Census dataset
censusData <- read_csv("census_data_full_2008-2021.csv")

Merging Data Sets

## Joining two datasets
finalData <- inner_join(fastFoodData, censusData, by = "geoid")

finalData <- finalData %>% select(geoid,keys,address,city,country,name,postalCode,province,lat,long,year,population,median_income,median_monthly_rent_cost,median_monthly_home_cost, prop_female,prop_male, prop_poverty)

1. Data dictionary

# Creating data dictionary
dataDictionary <- tibble(Variable = colnames(finalData),
                         Description = c("Geographic Region ID",
                                         "Keys for identification purposes",
                                         "Address of the location",
                                         "City where the location is situated",
                                         "Country of the location",
                                         "Name of the location",
                                         "Postal code of the location",
                                         "Province or state where the location is located",
                                         "Latitude of the location",
                                         "Longitude of the location",
                                         "Year of data collection",
                                         "Population count",
                                         "Median income of the location",
                                         "Median monthly rental cost of housing",
                                         "Median monthly cost of home ownership",
                                         "Proportion of population that is female",
                                         "Proportion of population that is male",
                                         "Proportion of people 25 and older living in poverty"),
                         Type = map_chr(finalData, .f = function(x){typeof(x)[1]}),
                         Class = map_chr(finalData, .f = function(x){class(x)[1]}))

#Setting theme for flextable
set_flextable_defaults(
  font.size = 10, theme_fun = theme_vanilla,
  padding = 6,
  background.color = "#EFEFEF")

flextable::flextable(dataDictionary, cwidth = 2)

Variable

Description

Type

Class

geoid

Geographic Region ID

character

character

keys

Keys for identification purposes

character

character

address

Address of the location

character

character

city

City where the location is situated

character

character

country

Country of the location

character

character

name

Name of the location

character

character

postalCode

Postal code of the location

character

character

province

Province or state where the location is located

character

character

lat

Latitude of the location

double

numeric

long

Longitude of the location

double

numeric

year

Year of data collection

double

numeric

population

Population count

double

numeric

median_income

Median income of the location

double

numeric

median_monthly_rent_cost

Median monthly rental cost of housing

double

numeric

median_monthly_home_cost

Median monthly cost of home ownership

double

numeric

prop_female

Proportion of population that is female

double

numeric

prop_male

Proportion of population that is male

double

numeric

prop_poverty

Proportion of people 25 and older living in poverty

double

numeric

2. Data Cleaning

a. Merging Datasets

Already did above

b. Date/Time Manipulation

lubridate is primarily designed for handling dates and times, it can also be used for simple year manipulation by converting the year into a complete date object with a fixed month and day.

finalData <- finalData %>% mutate(Date = make_datetime(year = year, month = 01, day = 01))
head(finalData$Date)
## [1] "2008-01-01 UTC" "2009-01-01 UTC" "2010-01-01 UTC" "2011-01-01 UTC"
## [5] "2012-01-01 UTC" "2013-01-01 UTC"

c. String Manipulation

#1. Converting Short From States to Long Form States
# Mapping of short form to long form state names
state_mapping <- c("NY" = "New York", "OH" = "Ohio", "KY" = "Kentucky", "SC" = "South Carolina", "AR" = "Arkansas", "OK" = "Oklahoma", "IN" = "Indiana", "NC" = "North Carolina", "TN" = "Tennessee", "TX" = "Texas", "LA" = "Louisiana", "KS" = "Kansas", "ND" = "North Dakota", "UT" = "Utah", "GA" = "Georgia", "NM" = "New Mexico", "OR" = "Oregon", "HI" = "Hawaii", "VT" = "Vermont", "MI" = "Michigan", "MO" = "Missouri", "WI" = "Wisconsin", "WA" = "Washington", "MS" = "Mississippi", "NE" = "Nebraska", "ME" = "Maine", "MN" = "Minnesota", "AL" = "Alabama", "IA" = "Iowa", "WV" = "West Virginia", "AZ" = "Arizona", "SD" = "South Dakota", "WY" = "Wyoming", "IL" = "Illinois", "VA" = "Virginia", "FL" = "Florida", "CA" = "California", "MT" = "Montana", "ID" = "Idaho", "PA" = "Pennsylvania", "RI" = "Rhode Island", "NV" = "Nevada", "NJ" = "New Jersey", "MA" = "Massachusetts", "MD" = "Maryland", "CO" = "Colorado", "NH" = "New Hampshire", "CT" = "Connecticut", "AK" = "Alaska", "DE" = "Delaware", "DC" = "District of Columbia", "Co Spgs" = "Colorado Springs")

# Convert short form to long form in the finalData dataset
finalData$province <- str_replace_all(finalData$province, state_mapping)
#2. Update "Colorado Springs" to "Colorado" in the province variable
finalData$province <- str_replace(finalData$province, "Colorado Springs", "Colorado")
#3. Handling Duplicates for vendor names

# Replace "&" with "AND"
finalData$name <- str_replace(finalData$name, "&", "AND")

# Remove all non-alphanumeric characters
finalData$name <- str_replace_all(finalData$name, "\\W", "")

# Remove specific characters
finalData$name <- str_replace_all(finalData$name, "Æ", "")

# Convert to uppercase
finalData$name <- str_to_upper(finalData$name)

# Handle duplicates
finalData$name <- str_replace(finalData$name, "MCDONALDS.*", "MCDONALDS")
finalData$name <- str_replace(finalData$name, "LONGJOHNSILVERSA.*", "LONGJOHNSILVERSAANDW")
finalData$name <- str_replace(finalData$name, "AANDWLONGJOHNSILVERS", "LONGJOHNSILVERSAANDW")
finalData$name <- str_replace(finalData$name, "ARBYS.*", "ARBYS")
finalData$name <- str_replace(finalData$name, "ZAXBYS.*", "ZAXBYS")
finalData$name <- str_replace(finalData$name, "WENDYS.*", "WENDYS")
finalData$name <- str_replace(finalData$name, "QUIZNOS.*", "QUIZNOS")
finalData$name <- str_replace(finalData$name, "POPEYES.*", "POPEYES")
finalData$name <- str_replace(finalData$name, "PANDAEXPRESS.*", "PANDAEXPRESS")
finalData$name <- str_replace(finalData$name, "LITTLECAESARS.*", "LITTLECAESARS")
finalData$name <- str_replace(finalData$name, "KFCK.*", "KFC")
finalData$name <- str_replace(finalData$name, "FIVEGUYS.*", "FIVEGUYS")
finalData$name <- str_replace(finalData$name, "DAIRYQUEEN.*", "DAIRYQUEEN")
finalData$name <- str_replace(finalData$name, "CHIPOTLE.*", "CHIPOTLE")
finalData$name <- str_replace(finalData$name, "AUNTIEANNES.*", "AUNTIEANNES")
finalData$name <- str_replace(finalData$name, "CARLSJRTHE.*", "CARLSJRGREENBURRITO")

3. Exploratory Data Analysis

skim(finalData)
Data summary
Name finalData
Number of rows 229301
Number of columns 19
_______________________
Column type frequency:
character 8
numeric 10
POSIXct 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
geoid 0 1 5 5 0 1690 0
keys 0 1 26 109 0 9946 0
address 0 1 5 60 0 9880 0
city 0 1 3 23 0 2759 0
country 0 1 2 2 0 1 0
name 0 1 2 41 0 457 0
postalCode 0 1 4 10 0 5250 0
province 0 1 4 20 0 51 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
lat 0 1 37.49 5.05 19.65 33.96 38.34 41.14 64.84 ▁▆▇▁▁
long 0 1 -92.01 15.11 -159.38 -97.66 -87.59 -81.34 -67.45 ▁▁▃▆▇
year 0 1 2014.60 3.82 2008.00 2011.00 2015.00 2018.00 2021.00 ▆▇▅▇▇
population 0 1 905770.07 1677137.71 3688.00 112060.00 320095.00 870893.00 10170292.00 ▇▁▁▁▁
median_income 5 1 56961.81 15354.12 18972.00 46356.00 54053.00 64468.00 156821.00 ▃▇▂▁▁
median_monthly_rent_cost 0 1 925.95 273.35 245.00 729.00 869.00 1063.00 2599.00 ▃▇▂▁▁
median_monthly_home_cost 0 1 1162.50 426.09 222.00 862.00 1088.00 1401.00 3254.00 ▃▇▃▁▁
prop_female 0 1 0.51 0.01 0.32 0.50 0.51 0.52 0.63 ▁▁▅▇▁
prop_male 0 1 0.49 0.01 0.37 0.48 0.49 0.50 0.68 ▁▇▅▁▁
prop_poverty 8 1 0.15 0.05 0.02 0.11 0.14 0.17 0.47 ▃▇▂▁▁

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
Date 0 1 2008-01-01 2021-01-01 2015-01-01 14
#Out of Total 229301 observations
#5 median_income are missing - Mainly because it was was not recorded or not available for those specific locations.
#8 prop_poverty are missing - Mainly because it was was not recorded or not available for those specific locations.

#My Observation:
#Missing data can impact summary statistics by reducing sample sizes and potentially biasing the results. Visualizations may also be affected if missing values are not handled appropriately, potentially leading to incomplete or inaccurate representations of the data.

a. Tables of Summary Statistics

1. Summary statistics of median_income by province

# Calculate the summary statistics of median_income by province
summary_stats_income <- finalData %>%
  group_by(province) %>%
  summarise(mean_median_income = mean(median_income),
            median_median_income = median(median_income),
            sd_median_income = sd(median_income),
            min_median_income = min(median_income),
            max_median_income = max(median_income))

# Keep only the 5 rows with the maximum values of 'mean_median_income' for each province
summary_stats_income_5 <- summary_stats_income %>%
  slice_max(n = 5, order_by = mean_median_income)

# Round the summary statistics to two decimal places
summary_stats_income_5 <- summary_stats_income_5 %>%
  mutate(across(starts_with("mean_"):starts_with("max_"), ~round(., 2)))

# Set the caption
caption_text <- "Table 1: Summary Statistics of Median Income by Province"

# Print the table with caption using knitr::kable()
knitr::kable(summary_stats_income_5, caption = caption_text, align = "l") %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Table 1: Summary Statistics of Median Income by Province
province mean_median_income median_median_income sd_median_income min_median_income max_median_income
Maryland 80214.32 81240 20091.50 35039 133267
New Jersey 75715.26 76443 15681.09 45339 124764
Alaska 75498.80 76250 7043.59 55966 90749
Massachusetts 74418.29 74698 16882.57 42290 116571
Hawaii 74139.45 73388 9600.96 46479 92600

2. Summary statistics of population by province

# Calculate the summary statistics of population by province
summary_stats_population <- finalData %>%
  group_by(province) %>%
  summarise(mean_population = round(mean(population), 0),
            median_population = round(median(population), 0),
            sd_population = round(sd(population), 0),
            min_population = round(min(population), 0),
            max_population = round(max(population), 0))

# Keep only the 5 rows with the maximum values of 'mean_population' for each province
summary_stats_population_5 <- summary_stats_population %>%
  slice_max(n = 5, order_by = mean_population)

# Set the caption
caption_text_population <- "Table 2: Summary Statistics of Population by Province"

# Print the table with caption using knitr::kable()
kable(summary_stats_population_5, caption = caption_text_population, align = "l")%>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Table 2: Summary Statistics of Population by Province
province mean_population median_population sd_population min_population max_population
California 4043664 2323892 3866682 12925 10170292
Arizona 2916351 3889161 1749812 16845 4496588
Illinois 1711499 310749 2216082 11390 5294664
Nevada 1521362 1969975 842850 15986 2292476
Texas 1461132 815996 1532290 6153 4728030

3. Summary statistics of the number of fast food restaurants by province:

# Calculate the summary statistics of the number of fast food restaurants by province
summary_stats_restaurants <- finalData %>%
  group_by(province) %>%
  summarise(total_restaurants = n())

# Keep only the rows with the maximum values of 'total_restaurants'
summary_stats_restaurants_10 <- summary_stats_restaurants %>%
  slice_max(n = 10, order_by = total_restaurants)

# Rename the columns in the summary statistics table
new_column_names_restaurants <- c("Province", "Total Restaurants")

# Set the new column names
colnames(summary_stats_restaurants_10) <- new_column_names_restaurants
# Set the caption
caption_text_restaurants <- "Table 3: Summary Statistics of Fast Food Restaurants by Province"

# Print the table with caption using knitr::kable()
kable(summary_stats_restaurants_10, caption = caption_text_restaurants, align = "l") %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Table 3: Summary Statistics of Fast Food Restaurants by Province
Province Total Restaurants
California 17476
Texas 14764
Ohio 13115
Florida 12155
Illinois 8278
North Carolina 8046
Georgia 7957
Indiana 7847
Missouri 7257
Pennsylvania 6969
#This code calculates the total number of fast food restaurants in each province by using the n() function within the summarise() function. The resulting table, summary_stats_restaurants, provides an overview of the distribution of fast food restaurants across different provinces.

4. Summary statistics of the average prop_male and prop_female in each province

# Calculate the summary statistics of the average prop_male and prop_female in each province
summary_stats_gender <- finalData %>%
  group_by(province) %>%
  summarise(mean_prop_male = round(mean(prop_male), 2),
            mean_prop_female = round(mean(prop_female), 2))
# Rename the columns in the summary statistics table
new_column_names_gender <- c("Province", "Mean Proportion Male", "Mean Proportion Female")

# Set the new column names
colnames(summary_stats_gender) <- new_column_names_gender

# Set the caption
caption_text_gender <- "Table 4: Summary Statistics of Average Proportions of Male and Female in Each Province"

# Print the table with caption using knitr::kable()
kable(head(summary_stats_gender), caption = caption_text_gender, align = "l")  %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Table 4: Summary Statistics of Average Proportions of Male and Female in Each Province
Province Mean Proportion Male Mean Proportion Female
Alabama 0.48 0.52
Alaska 0.52 0.48
Arizona 0.50 0.50
Arkansas 0.49 0.51
California 0.50 0.50
Colorado 0.50 0.50

5. Summary statistics of prop_poverty by province

#The code calculates summary statistics (mean, median, standard deviation, minimum, maximum) of the proportion of people living in poverty (prop_poverty) grouped by province. The resulting table, summary_stats_poverty, provides insights into the poverty levels across different provinces.

# Calculate the summary statistics of prop_poverty by province
summary_stats_poverty <- finalData %>%
  group_by(province) %>%
  summarise(mean_prop_poverty = round(mean(prop_poverty), 2),
            median_prop_poverty = round(median(prop_poverty), 2),
            sd_prop_poverty = round(sd(prop_poverty), 2),
            min_prop_poverty = round(min(prop_poverty), 2),
            max_prop_poverty = round(max(prop_poverty), 2))

# Rename the columns in the summary statistics table
new_column_names_poverty_province <- c("Province", "Mean Proportion Poverty", "Median Proportion Poverty", "St. Deviation Poverty", "Minimum Proportion Poverty", "Maximum Proportion Poverty")

# Set the new column names
colnames(summary_stats_poverty) <- new_column_names_poverty_province

# Set the caption
caption_text_poverty <- "Table 5: Summary Statistics of Proportion of People Living in Poverty by Province"

# Print the table with caption using knitr::kable()
kable(head(summary_stats_poverty, 5), caption = caption_text_poverty, align = "l") %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Table 5: Summary Statistics of Proportion of People Living in Poverty by Province
Province Mean Proportion Poverty Median Proportion Poverty St. Deviation Poverty Minimum Proportion Poverty Maximum Proportion Poverty
Alabama 0.17 0.18 0.05 0.04 0.36
Alaska 0.08 0.08 0.01 0.06 0.13
Arizona 0.16 0.16 0.03 0.10 0.33
Arkansas 0.18 0.17 0.04 0.07 0.35
California 0.15 0.14 0.04 0.05 0.30

6. Summary statistics of median_monthly_rent_cost by province

#The code calculates summary statistics (mean, median, standard deviation, minimum, maximum) of the median monthly rental cost (median_monthly_rent_cost) grouped by province. The resulting table, summary_stats_rent, provides insights into the rental cost variations across different provinces.

# Calculate the summary statistics of median_monthly_rent_cost by province
summary_stats_rent <- finalData %>%
  group_by(province) %>%
  summarise(mean_rent_cost = round(mean(median_monthly_rent_cost), 2),
            median_rent_cost = round(median(median_monthly_rent_cost), 2),
            sd_rent_cost = round(sd(median_monthly_rent_cost), 2),
            min_rent_cost = round(min(median_monthly_rent_cost), 2),
            max_rent_cost = round(max(median_monthly_rent_cost), 2))

# Rename the columns in the summary statistics table
new_column_names_rent_cost <- c("Province", "Mean Rent Cost", "Median Rent Cost", "St. Deviation Rent Cost", "Minimum Rent Cost", "Maximum Rent Cost")

# Set the new column names
colnames(summary_stats_rent) <- new_column_names_rent_cost

# Set the caption
caption_text_rent <- "Table 6: Summary Statistics of Median Monthly Rental Cost by Province"

# Print the table with caption using knitr::kable()
kable(head(summary_stats_rent, 5), caption = caption_text_rent, align = "l")  %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Table 6: Summary Statistics of Median Monthly Rental Cost by Province
Province Mean Rent Cost Median Rent Cost St. Deviation Rent Cost Minimum Rent Cost Maximum Rent Cost
Alabama 763.08 768 130.05 367 1194
Alaska 1166.21 1197 128.91 770 1350
Arizona 963.05 943 152.64 502 1389
Arkansas 712.41 713 106.91 386 1053
California 1335.34 1272 302.02 646 2599

b. Data Visualization

1. Bar Plot: Comparison of Median Income by Province

# Calculate the median income by province
median_income_by_province <- finalData %>%
  group_by(province) %>%
  summarise(median_income = median(median_income))

# Sort the data by median income in descending order
median_income_by_province <- median_income_by_province %>%
  arrange(desc(median_income))

# Create an interactive bar plot to compare median income by province
plot_median_income <- ggplot(median_income_by_province, aes(x = reorder(province, -median_income), y = median_income)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  labs(title = "Comparison of Median Income by Province",
       x = "Province",
       y = "Median Income (USD)",
       caption = "Data Source: FinalData") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Convert the plot to an interactive plot using plotly
interactive_plot_income_median <- ggplotly(plot_median_income)

# Display the interactive plot
interactive_plot_income_median
# This bar plot compares the median income across different provinces. The provinces are ordered based on their median income in descending order. The x-axis represents the provinces, and the y-axis represents the median income in USD. The bars are filled with a steel blue color. The plot has an appropriate title, axis labels with units of measurement, and a caption indicating the data source. The x-axis labels are angled for better readability.

2. Word Cloud: Fast Food Leading Brands in the US

# Filter the data for fast food leading brands
leading_brands <- subset(finalData, !is.na(name))$name

# Generate the word frequencies
word_freq <- table(leading_brands)

# Convert the word frequencies to a data frame
word_df <- data.frame(word = names(word_freq), freq = as.numeric(word_freq))

# Sort the data frame by frequency in descending order
word_df <- word_df[order(word_df$freq, decreasing = TRUE), ]

# Limit the number of words to 100
word_df <- head(word_df, 50)

# Create the interactive word cloud
wordcloud2(word_df, size = 1.5, color = "random-dark", backgroundColor = "white")

3. Bar Plot: Top 10 Cities in the US with High Fast Food Rate

# Filter the data for fast food restaurants
fast_food_data <- finalData %>%
  filter(!is.na(city))

# Calculate the number of fast food restaurants in each city
city_counts <- fast_food_data %>%
  count(city)

# Sort the cities by the number of fast food restaurants in descending order
top_cities <- city_counts %>%
  arrange(desc(n)) %>%
  head(10)

# Create a bar plot of the top 10 cities with the highest fast food rate
bar_plot <- ggplot(top_cities, aes(x = reorder(city, -n), y = n, fill = n)) +
  geom_bar(stat = "identity") +
  scale_fill_gradient(low = "lightblue", high = "darkblue", name = "Number of Restaurants") +
  labs(title = "Top 10 Cities with High Fast Food Rate",
       x = "City",
       y = "Number of Fast Food Restaurants",
       caption = "Data Source: FinalData") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Convert the plot to an interactive plot using plotly
interactive_plot <- ggplotly(bar_plot)

# Display the interactive plot
interactive_plot

4. Line Plot: Trend of Fast Food Restaurants Over the Years

# Calculate the total number of fast food restaurants by year
restaurant_trend <- finalData %>%
  group_by(year) %>%
  summarise(total_restaurants = n())

# Create an interactive line plot to show the trend of fast food restaurants over the years
plot_restaurant_trend <- ggplot(restaurant_trend, aes(x = year, y = total_restaurants)) +
  geom_line(color = "steelblue", size = 1.5) +
  labs(title = "Trend of Fast Food Restaurants Over the Years",
       x = "Year",
       y = "Total Restaurants",
       caption = "Data Source: FinalData") +
  theme_bw()

# Convert the plot to an interactive plot using plotly
interactive_plot_restaurant_trend <- ggplotly(plot_restaurant_trend)

# Display the interactive plot
interactive_plot_restaurant_trend

5. Pie Chart:Proportions of different fast food restaurants in top 3 cities “Cincinnati”, “Las Vegas”, “Houston”

a. For Cicinnati

# Filter the data for Cincinnati
cincinnati_restaurants <- finalData %>%
  filter(city == "Cincinnati")

# Calculate the proportions of different fast food restaurants in Cincinnati
cincinnati_proportions <- cincinnati_restaurants %>%
  group_by(name) %>%
  summarise(proportion = n()) %>%
  mutate(proportion = proportion / sum(proportion))

# Create an interactive pie chart for Cincinnati without legend
interactive_pie_cincinnati <- plot_ly(cincinnati_proportions, labels = cincinnati_proportions$name, values = cincinnati_proportions$proportion, type = "pie",
                                      textposition = "outside", insidetextfont = list(color = "#FFFFFF"), hoverinfo = "label+percent",
                                      marker = list(colors = rainbow(length(cincinnati_proportions$name)), line = list(color = "#FFFFFF", width = 2))) %>%
  layout(title = "Proportions of Different Fast Food Restaurants in Cincinnati",
         showlegend = FALSE,
         annotations = list(text = "Data Source: FinalData", x = 0.5, y = -0.2, showarrow = FALSE))

# Display the interactive pie chart without legend
interactive_pie_cincinnati

b. For Las Vegas

# Filter the data for Las Vegas
lasvegas_restaurants <- finalData %>%
  filter(city == "Las Vegas")

# Calculate the proportions of different fast food restaurants in Las Vegas
lasvegas_proportions <- lasvegas_restaurants %>%
  group_by(name) %>%
  summarise(proportion = n()) %>%
  mutate(proportion = proportion / sum(proportion))

# Create an interactive pie chart for las vegas without legend
interactive_pie_lasvegas <- plot_ly(lasvegas_proportions, labels = lasvegas_proportions$name, values = lasvegas_proportions$proportion, type = "pie",
                                      textposition = "outside", insidetextfont = list(color = "#FFFFFF"), hoverinfo = "label+percent",
                                      marker = list(colors = rainbow(length(lasvegas_proportions$name)), line = list(color = "#FFFFFF", width = 2))) %>%
  layout(title = "Proportions of Different Fast Food Restaurants in Las Vegas",
         showlegend = FALSE,
         annotations = list(text = "Data Source: FinalData", x = 0.5, y = -0.2, showarrow = FALSE))

# Display the interactive pie chart without legend
interactive_pie_lasvegas

c. For Houston

# Filter the data for Houston
houston_restaurants <- finalData %>%
  filter(city == "Houston")

# Calculate the proportions of different fast food restaurants in Houston
houston_proportions <- houston_restaurants %>%
  group_by(name) %>%
  summarise(proportion = n()) %>%
  mutate(proportion = proportion / sum(proportion))

# Create an interactive pie chart for las vegas without legend
interactive_pie_houston <- plot_ly(houston_proportions, labels = houston_proportions$name, values = houston_proportions$proportion, type = "pie",
                                      textposition = "outside", insidetextfont = list(color = "#FFFFFF"), hoverinfo = "label+percent",
                                      marker = list(colors = rainbow(length(houston_proportions$name)), line = list(color = "#FFFFFF", width = 2))) %>%
  layout(title = "Proportions of Different Fast Food Restaurants in Houston",
         showlegend = FALSE,
         annotations = list(text = "Data Source: FinalData", x = 0.5, y = -0.2, showarrow = FALSE))

# Display the interactive pie chart without legend
interactive_pie_houston

6. Box Plot: Distribution of Median Income by Province

# Create the box plot with rotated x-axis labels
plot_median_income <- ggplot(finalData, aes(x = province, y = median_income)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Distribution of Median Income by Province",
       x = "Province",
       y = "Median Income",
       caption = "Data Source: FinalData") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Convert the plot to an interactive plot using plotly
interactive_plot_median_income <- ggplotly(plot_median_income)

# Display the interactive plot
interactive_plot_median_income

7. Scatter Plot: Relationship between Poverty and Fast Food Restaurants

# Calculate the proportion of people in poverty by province
poverty_data <- finalData %>%
  group_by(province) %>%
  summarise(prop_poverty = mean(prop_poverty),
            total_restaurants = n_distinct(name))

# Create an interactive scatter plot
scatter_plot <- ggplot(poverty_data, aes(x = prop_poverty, y = total_restaurants)) +
  geom_point(color = "steelblue") +
  labs(title = "Relationship between Poverty and Fast Food Restaurants",
       x = "Proportion of People in Poverty",
       y = "Total Number of Fast Food Restaurants",
       caption = "Data Source: FinalData") +
  theme_bw()

# Convert the plot to an interactive plot using plotly
interactive_plot_poverty <- ggplotly(scatter_plot)

# Display the interactive plot
interactive_plot_poverty

8. USMAP: Number of Fast Food Restaurants per Province

# Calculate the total number of fast food restaurants per province
restaurants_per_province <- finalData %>%
  group_by(province) %>%
  summarise(total_restaurants = n_distinct(name))

# Convert the province names to state abbreviations
state_abbreviations <- state.abb[match(restaurants_per_province$province, state.name)]
restaurants_per_province$state <- state_abbreviations

# Add state names to the data frame
restaurants_per_province$state_name <- state.name[match(restaurants_per_province$state, state.abb)]

# Create the plotly object
plotly_map <- plot_usmap(data = restaurants_per_province, values = "total_restaurants", color = "black") +
  labs(title = "Number of Fast Food Restaurants per Province",
       fill = "Total Restaurants",
       caption = "Data Source: FinalData") +
  scale_fill_continuous(name = "Total Restaurants") +
  theme(legend.position = "right")

# Convert the plot to an interactive plot using plotly
interactive_plot_usmap <- ggplotly(plotly_map, tooltip = c("state_name", "total_restaurants"),
                             text = c("state_name", "total_restaurants"))

# Display the interactive plot
interactive_plot_usmap

4. Monte Carlo Methods of Inference

##  A Monte Carlo simulation using the One-Way ANOVA model

## Null Hypothesis (H0): There is no difference in the average number of locations for the different restaurant names across the states in the United States.

## Alternative Hypothesis (Ha):There is a difference in the average number of locations for the different restaurant names across the states in the United States.

set.seed(1994)

# Step 1: Create a new data frame with counts of restaurant locations in each state for each restaurant name
restaurant_counts <- finalData %>%
  group_by(name, province) %>%
  summarise(num_locations = n()) %>%
  ungroup()

# Step 2: Subset to restaurants with a sufficiently large number of locations in every state
min_locations_per_state <- 5  # Set the minimum number of locations per state
restaurant_counts_subset <- restaurant_counts %>%
  group_by(name) %>%
  filter(all(num_locations >= min_locations_per_state)) %>%
  ungroup()

# Step 3: Perform One-Way ANOVA
anova_result <- aov(num_locations ~ name, data = restaurant_counts_subset)

# Step 4: Extract the F-statistic from the ANOVA result
f_statistic <- summary(anova_result)[[1]]$`F value`

# Step 5: Create a null distribution of F-statistics using Monte Carlo simulation
num_permutations <- 1000  # Increase the number of permutations
null_distribution <- numeric(num_permutations)

for (i in 1:num_permutations) {
  # Step 5a: Shuffle the num_locations randomly between different restaurant names
  shuffled_counts <- restaurant_counts %>%
    mutate(name_shuffled = sample(name)) %>%
    group_by(name_shuffled, province) %>%
    summarise(num_locations_shuffled = sum(num_locations)) %>%
    ungroup()
  
  # Step 5b: Perform One-Way ANOVA for the shuffled data and extract the F-statistic
  null_anova_result <- aov(num_locations_shuffled ~ name_shuffled, data = shuffled_counts)
  
  # Step 5c: Extract the F-statistic from the shuffled ANOVA result
  null_f_statistic <- summary(null_anova_result)[[1]]$`F value`
  
  # Step 5d: Check if the F-statistic is present in the summary (in case it's not available for some permutations)
  if (!is.null(null_f_statistic)) {
    null_distribution[i] <- null_f_statistic
  }
}

# Step 6: Remove any NA values from the null distribution
null_distribution <- null_distribution[!is.na(null_distribution)]

# Calculate the 5th percentile of the null distribution
null_5th_percentile <- quantile(null_distribution, 0.05)

# Step 7: Plot the null distribution of F-statistics
comparison_plot_f <- ggplot(data.frame(F_statistic = null_distribution), aes(x = F_statistic)) +
  geom_histogram(binwidth = 0.2, fill = alpha("lightblue", 0.7), color = "black") +
  geom_vline(xintercept = null_5th_percentile, color = "red", linetype = "dashed") +
  geom_vline(xintercept = f_statistic, color = "blue", linetype = "solid") +
  labs(x = "F-statistic",
       y = "Frequency",
       title = "Null Distribution of the F-statistic in One-Way ANOVA") +
  theme_bw()

# Convert the plot to an interactive plot using plotly
interactive_plot_one_way <- ggplotly(comparison_plot_f)

# Display the interactive plot
interactive_plot_one_way

Interpretation:

The red dashed vertical line represents the 5th percentile value of the null distribution of F-statistics.

This value is the threshold below which only 5% of the F-statistics from the random permutations fall.

It serves as the critical value for determining statistical significance.

The blue solid vertical line represents the observed F-statistic obtained from the original data.

This value is calculated from the One-Way ANOVA comparing the average number of locations for different

restaurant names across the states.

Comparing the observed F-statistic to the critical value at the 5th percentile, we draw the following conclusions:

- The observed F-statistic is greater than the critical value.

- The observed F-statistic falls in the right tail of the null distribution.

Therefore, based on the observed F-statistic and its comparison with the 5th percentile of the null distribution,

we reject the null hypothesis and conclude that there are differences in the average number of locations

for different restaurant names across the states in the United States.

5. Bootstrap Methods of Inference

# Problem Statement:
# The objective of this analysis is to estimate the median income for a specific dataset ("finalData") using the bootstrap method of inference. We want to determine the uncertainty associated with the median income estimate and construct a 95% confidence interval for the true median income. Additionally, we aim to visualize the distribution of bootstrap medians and compare it to the observed median income to assess the representatives of the observed estimate.


#Removing missing values from median_income
median_income_cleaned <- na.omit(finalData$median_income)

#Calculating the median of a bootstrap sample
bootstrap_median <- function(x) {median(sample(x, replace = TRUE))}

#setting seed
set.seed(123456)
#Generating bootstrap samples while calculating the median of them
bootstrap_median <- replicate(10000, bootstrap_median(median_income_cleaned))

#Calculating the standard error
SE <- sd(bootstrap_median)

#Calculating the 95% confidence intervals
conf_interval <- 0.95
lower_bound <- quantile(bootstrap_median, probs = (1 - conf_interval )/ 2)
upper_bound <- quantile(bootstrap_median, probs = 1 - (1 - conf_interval)/ 2)

#Generating Histogram of the distribution
data_frame <- data.frame(bootstrap_median)
hist_plot <- ggplot(data_frame, aes(x = bootstrap_median)) +
  geom_histogram(color = "black", fill = "dodgerblue") +
  geom_vline(xintercept = lower_bound, linetype = "dashed", color = "red", size = 1.5) +
  geom_vline(xintercept = upper_bound, linetype = "dashed", color = "red", size = 1.5) +
  geom_vline(xintercept = median(median_income_cleaned), color = "black", linetype = "dashed", size = 1) +
  scale_x_continuous(labels = scales::comma) +
  labs(x = "Median income",
       y = "Frequency",
       title = "Median Income Bootstrap Sample",
       subtitle = "Observed median income--dashed line")

# Convert the ggplot histogram to an interactive plot using plotly
interactive_bootstrap <- ggplotly(hist_plot)

# Display the interactive plot
interactive_bootstrap

Interpretation:

The histogram displays the distribution of bootstrap medians, generated by repeatedly sampling with replacement from the observed data and calculating the median of each bootstrap sample. The observed median income is represented by the dashed black line on the histogram, which is located at approximately 54053.

The bootstrap analysis results in a 95% confidence interval for the median income estimate. The lower bound of the confidence interval is 53979, and the upper bound is 54102. This means that we are 95% confident that the true median income in the population falls within the range of 53979 to 54102.

6. Conclusions / Main Takeaways:

Top 5 Cities With Fast Food Restaurants - Cincinnati, Las Vegas, Houston, Miami, Denver.

Top 5 most numerous fast food restaurants - McDonald’s, Taco Bell ,Subway ,Burger King, Arby’s.

The line plot depicts the trend of fast food restaurants from 2008 to 2021. The number remained stable at around 17635 from 2009 to 2019. However, in 2020, it decreased to 9946, likely due to the pandemic. In 2021, the industry rebounded strongly, reaching 17662 restaurants. This demonstrates the fast food industry’s resilience and adaptability to changing conditions.